#include "mtp.h"
#include "bss.h"

#define MAX_BES_LEN 37

extern int minScore;
extern int minId;
extern int max_seq_olap;
extern int maxOlap;
extern int minOlap;
extern int mult_ctg_ratio;
extern int only_pos_olap;
extern int only_single_bes;
extern int Use_sizes;

extern void add_to_pairs();
extern int filCheck();
extern int GetSize();

struct hit{
  char bes[MAX_BES_LEN+1];
  char clone[CLONE_SZ+1];
  char draft_clone[CLONE_SZ+1];
  int ctg;
  int index;
  int start;                    /* Start of hit w.r.t. seqctg*/
  int end;                      /* End of hit w.r.t. seqctg*/
  int rc;
  int x;                        /* Left fpc clone coordinate*/
  int y;                        /* Right fpc clone coordinate*/
  int i_bss;                     /* index of this hit in the bss table */
};

struct seqctg{
  struct hit *hits;
  int numhits;
  int maxhits;
};

struct seqctg *seqctgs=NULL;

int maxseqctg=0;

typedef struct
{
    gint seq_ctg;
    struct hit *hits[2];
} PairHitsData_t;


static void
get_hits_mem(struct hit **p, int numhits, int *maxhits){
  if(numhits >= *maxhits-1){
    if(*maxhits==0){
      *maxhits=100;
      *p=calloc(*maxhits,sizeof(struct hit));
      NOMEM2(*p,"hits");
    }
    else{
      *maxhits+=100;
      *p=realloc(*p,sizeof(struct hit)*(*maxhits));
      NOMEM2(*p,"hits (realloc)");
    }
  }
}

static void
add_to_seqctgs(int seq_ctg, const char* draft_clone, const char *bes, const char *clone,
               int ctg, int start, int end, int rc, int i_bss){
  int index;
  struct hit* h;
  CLONE *clp;

  if(seqctgs==NULL){
    printf("WARNING:  seqctg array memory not allocated!!\n");
    return;
  }

  if(!fppFind(acedata, (char*)clone, &index, cloneOrder)){
    printf("WARNING: %s in bss file not in FPC!!\n", clone);
    return;
  }

  if(seqctgs[seq_ctg].numhits>=seqctgs[seq_ctg].maxhits)
    get_hits_mem(&(seqctgs[seq_ctg].hits), seqctgs[seq_ctg].numhits,
                                          &(seqctgs[seq_ctg].maxhits));

  clp = arrp(acedata, index, CLONE);
  h = &seqctgs[seq_ctg].hits[seqctgs[seq_ctg].numhits];
  h->x = clp->x;
  h->y = clp->y;
  h->index = index;

  strcpy(h->bes, bes);
  strcpy(h->clone, clone);
  strncpy(h->draft_clone, draft_clone,CLONE_SZ);
  h->ctg = ctg;
  h->start = start;
  h->end = end;
  h->rc = rc;
  h->i_bss = i_bss;

  seqctgs[seq_ctg].numhits++;
}

int good_hits, reject_singleton, reject_score, reject_id, reject_both, false_hit_cnt;


static int
load_bss_file(char *filename, int minscore, int minid, BSS_INST *pbsi)
{
    BSS_HIT* ph;
    int i;
    int has_score = 0;
    int has_id = 0;
    int has_tlen = 0, has_tstart = 0, has_tend = 0;
    int has_qlen = 0, has_qstart = 0, has_qend = 0;
    int has_rc = 0;

    if (!read_bss_file(filename,pbsi))
    {
        return 0;
    }      

    /* see if the bss file has score and/or %id */
    
    for (i = 0; i < bss_ncols; i++)
    {
        if (BSS_SCORE == bss_column_order[i]) has_score = 1;
        if (BSS_IDENTITY == bss_column_order[i]) has_id = 1;
        if (BSS_RC == bss_column_order[i]) has_rc = 1;

        if (BSS_QLEN == bss_column_order[i])    has_qlen = 1;
        if (BSS_QSTART == bss_column_order[i])  has_qstart = 1;
        if (BSS_QEND == bss_column_order[i])    has_qend = 1;

        if (BSS_TLEN == bss_column_order[i])    has_tlen = 1;
        if (BSS_TSTART == bss_column_order[i])  has_tstart = 1;
        if (BSS_TEND == bss_column_order[i])    has_tend = 1;

    }

    if (!has_score) printf("BSS file does not contain a hit score column; ignoring min score setting\n");fflush(0);
    if (!has_id) printf("BSS file does not contain a %%id column; ignoring min %%id setting\n");fflush(0);

//    if (!has_qlen) printf("BSS file cannot be used because it does not contain a Query_len column\n");fflush(0);
    if (!has_qstart) printf("BSS file cannot be used because it does not contain a Query_start column\n");fflush(0);
    if (!has_qend) printf("BSS file cannot be used because it does not contain a Query_end column\n");fflush(0);

//    if (!has_tlen) printf("BSS file cannot be used because it does not contain a Targ_len column\n");fflush(0);
//    if (!has_tstart) printf("BSS file cannot be used because it does not contain a Targ_start column\n");fflush(0);
//    if (!has_tend) printf("BSS file cannot be used because it does not contain a Targ_end column\n");fflush(0);

    if (!has_rc) printf("BSS file cannot be used because it does not contain an RC column\n");fflush(0);

    maxseqctg = arrayMax(pbsi->bss_queryhits);
    seqctgs = (struct seqctg*)malloc(sizeof(struct seqctg)*(maxseqctg + 1));
    memset(seqctgs,0,sizeof(struct seqctg)*(maxseqctg + 1));

    for (i = 0; i < arrayMax(pbsi->bss_hits); i++)
    {
        ph = arrp(pbsi->bss_hits,i,BSS_HIT);
        ph->filtered = 1; // set to zero in find_good_pairs if good
        if (has_id && ph->data[BSS_IDENTITY].i < minid) 
        {
            reject_id++;
            if (has_score && ph->data[BSS_SCORE].i < minscore) 
            {   
                reject_score++;
                reject_both++;
            }
            continue; 
        }                       
        if (has_score && ph->data[BSS_SCORE].i < minscore) 
        {
            reject_score++;
            continue;  
        }
        good_hits++;
 
        add_to_seqctgs( ph->data[BSS_SEQCTG].i, 
						arrp(pbsi->bss_queryhits,ph->data[BSS_SEQCTG].i,QUERY_HIT_DATA)->query, // name of the draft contig
                        ph->data[BSS_BES].str, // subject
                        ph->data[BSS_CLONE].str,
                        ph->data[BSS_CTG].i,
                        ph->data[BSS_QSTART].i,
                        ph->data[BSS_QEND].i,
                        (ph->data[BSS_RC].str[0] == 'y' ? 1 : 0),
                        i
                    ); 
                       

    }

    return 1;
}

static void
remove_seqctg(int index){
  g_free(seqctgs[index].hits);
  seqctgs[index].hits=NULL;
  seqctgs[index].numhits=0;
  seqctgs[index].maxhits=0;
}

static void
remove_hit(int seqctg, int index){
  int i;

  for(i=index; i<seqctgs[seqctg].numhits-1; i++){
    seqctgs[seqctg].hits[i] = seqctgs[seqctg].hits[i+1];
  }
  if(--seqctgs[seqctg].numhits == 0) remove_seqctg(seqctg);
}

static int
sortHits(struct hit *p1, struct hit *p2){
  if(p1->x < p2->x) return -1;
  if(p1->x > p2->x) return 1;
  return 0;
}

struct hitCtg{
  int ctg;
  int num_hits;
};

static int
sortHitCtgs(struct hitCtg *p1, struct hitCtg *p2){
  if(p1->num_hits < p2->num_hits) return 1;
  if(p1->num_hits > p2->num_hits) return -1;
  return 0;
}

static int
mult_ctgs(int seqctg){
  struct hit *hits;
  int cnt;
  struct hitCtg hit_ctgs[10];
  int hit_ctg_num=0;
  int i,j;
  int found;
  int best_ctg;
  int best_ctg_hits=0;
  int other_ctg_hits=0;
  int best_to_other_ratio;

  hits=seqctgs[seqctg].hits;
  cnt=seqctgs[seqctg].numhits;

  for(i=0;i<cnt;i++){
    found=FALSE;
    for(j=0;j<hit_ctg_num && !found;j++){
      if(hits[i].ctg==hit_ctgs[j].ctg){
        hit_ctgs[j].num_hits++;
        found=TRUE;
      }
    }
    if(!found){
      hit_ctgs[hit_ctg_num].ctg=hits[i].ctg;
      hit_ctgs[hit_ctg_num].num_hits=1;
      hit_ctg_num++;
    }
    if(hit_ctg_num==10) return TRUE;
  }
  qsort(hit_ctgs, hit_ctg_num, sizeof(struct hitCtg), (void *)sortHitCtgs);
  if(hit_ctg_num>1){
    /* Allow a specified ratio of best ctg hits to other ctg hits*/
    best_ctg=hit_ctgs[0].ctg;
    best_ctg_hits=hit_ctgs[0].num_hits;
    for(i=1;i<hit_ctg_num;i++){
      other_ctg_hits+=hit_ctgs[i].num_hits;
    }
    best_to_other_ratio=best_ctg_hits/other_ctg_hits;
    if(best_to_other_ratio >= mult_ctg_ratio){
      /*Remove hits to other contigs*/
      for(i=0;i<seqctgs[seqctg].numhits;i++){
        if(hits[i].ctg!=best_ctg){
          remove_hit(seqctg, i);
          i--; /* Remaining hits have shifted left*/
          false_hit_cnt++;
          good_hits--;
        }
      }
      return FALSE;
    }
    return TRUE;
  }
  return FALSE;
}

static int
remove_multiple_bes_hits(int seq_ctg, char *bes){
  int i, j;
  int bad_cnt=0;

  for(i=seq_ctg+1;i<=maxseqctg;i++){
    for(j=0;j<seqctgs[i].numhits;j++){
      if(!strcmp(seqctgs[i].hits[j].bes, bes)){
        /* We found a duplicated BES*/
        remove_hit(i, j);
        good_hits--;
        bad_cnt++;
        j--; /* Remaining hits have shifted left*/
      }
    }
  }
  return bad_cnt;
}

static int
good_pair(struct hit *p1, struct hit *p2, int* seq_olap){
  int fpc_olap;

  if(p1->ctg != p2->ctg) return -5;

  if(p1->rc == p2->rc) return 0;
  
  if (p1->rc == 1) {
    *seq_olap= p1->end - p2->start;
  }
  else {
    *seq_olap= p2->end - p1->start;
  }
  fpc_olap=(p1->y - p2->x + 1);

  if(only_pos_olap && *seq_olap<0) return -1;
  if(abs(*seq_olap) > max_seq_olap) return -2;
  if(fpc_olap < minOlap) return -3;
  if(fpc_olap > maxOlap) return -4;
  return 1;
}

//#define WRITE_CHECK_FILE

static void
find_good_pairs(BSS_INST* pbsi)
{
    int i, j, k;
    int found_dup=FALSE;
    int bad_hits_mult=0, reject_mult_seqctg=0;
    int mult_cnt=0, good_cnt=0, reject_rc=0, reject_pos_olap=0,
        reject_max_olap=0, reject_max_fpc_olap=0, reject_min_fpc_olap=0, 
        reject_ctg=0;
    int ret_val;
    CLONE *clpL, *clpR;
    int sizeL, sizeR;
    int sizeL_from_file, sizeR_from_file;
    char str1[MAXPATHLEN+100];
    int have_sizes_dir=FALSE;

#ifdef WRITE_CHECK_FILE
    FILE *fp = fopen("mtp.cln", "w");
#endif

    false_hit_cnt=0;

    if (strlen(dirName) > 0)
        sprintf(str1,"%s/Sizes",dirName);
    else
        sprintf(str1,"./Sizes");
    if (filCheck(str1,"d") && Use_sizes)
        have_sizes_dir = TRUE;

    if (only_single_bes) {
        /* Remove hits where two seqCtgs hit a common BES*/
        for (i = 0; i <= maxseqctg; i++) {
            found_dup = FALSE;
            for (j = 0; j < seqctgs[i].numhits; j++) {
                if ((k = remove_multiple_bes_hits(i, seqctgs[i].hits[j].bes))
                    != 0) {
                    reject_mult_seqctg += k;
                    remove_hit(i, j);
                    good_hits--;
                    reject_mult_seqctg++;
                    j--;  /*Remaining hits have shifted left*/
                }
            }
        }
    }

    /* WMN 7/08  Make this an option so pseudomolecule ref sequence can be used */
    if (mult_ctg_ratio > 0)
    {
        /* Remove seqctgs hitting in multiple contigs*/
        /* NOTE mult_ctgs has a side effect, removing hits in the non-best contigs */
        for (i = 0; i <= maxseqctg; i++) {
            if (seqctgs[i].numhits > 0) {
                if (mult_ctgs(i)) {
                    bad_hits_mult += seqctgs[i].numhits;
                    remove_seqctg(i);
                    mult_cnt++;
                }
            }
        }
    }

    /* Sort all seqCtg hits by clone's left end value*/
    for (i = 0; i <= maxseqctg; i++) {
        qsort(seqctgs[i].hits, seqctgs[i].numhits,
              sizeof(struct hit), (void *)sortHits);
    }

    /* Find all good pairs produced by the seqctg*/
    for (i = 0; i <= maxseqctg; i++) {
        for (j = 0; j < seqctgs[i].numhits; j++) {
            for (k = j+1; k < seqctgs[i].numhits; k++) {
                int bssOlap;
                if ((ret_val = good_pair(&(seqctgs[i].hits[j]),
                                         &(seqctgs[i].hits[k]),&bssOlap)) == 1) {
                    sizeL=sizeR=0;
                    sizeL_from_file=sizeR_from_file=FALSE;
                    clpL=arrp(acedata,seqctgs[i].hits[j].index, CLONE);
                    clpR=arrp(acedata,seqctgs[i].hits[k].index, CLONE);
                    if (have_sizes_dir) {
                        sizeL=GetSize(clpL);
                        sizeR=GetSize(clpR);
                    }
                    if (sizeL==0)
                        sizeL=clpL->fp->b2*Proj.avgbandsize;
                    else
                        sizeL_from_file=TRUE;

                    if (sizeR==0)
                        sizeR=clpR->fp->b2*Proj.avgbandsize;
                    else
                        sizeR_from_file=TRUE;


                    add_to_pairs(seqctgs[i].hits[j].ctg, BSS_DRAFT,
                                 seqctgs[i].hits[j].clone,
                                 seqctgs[i].hits[k].clone,
                                 seqctgs[i].hits[k].draft_clone, NULL, 
                                 seqctgs[i].hits[j].bes, seqctgs[i].hits[k].bes,
                                 seqctgs[i].hits[j].x, seqctgs[i].hits[j].y,
                                 seqctgs[i].hits[k].x, seqctgs[i].hits[k].y,
                                 seqctgs[i].hits[j].index,
                                 seqctgs[i].hits[k].index,
                                 i, -1, -1,
                                 bssOlap, bssOlap, 
                                 sizeL, sizeR, sizeL_from_file,
                                 sizeR_from_file, NULL);

                    arrp(pbsi->bss_hits,seqctgs[i].hits[j].i_bss,BSS_HIT)->filtered = 0;
                    arrp(pbsi->bss_hits,seqctgs[i].hits[k].i_bss,BSS_HIT)->filtered = 0;


#ifdef WRITE_CHECK_FILE
                    fprintf(fp, "%d\t%s\t%s\n", i,
                            seqctgs[i].hits[j].bes, seqctgs[i].hits[k].bes);
#endif
                    good_cnt++;
                }
                else if (ret_val==0)  reject_rc++;
                else if (ret_val==-1) reject_pos_olap++;
                else if (ret_val==-2) reject_max_olap++;
                else if (ret_val==-3) reject_min_fpc_olap++;
                else if (ret_val==-4) reject_max_fpc_olap++;
                else if (ret_val==-5) reject_ctg++;
            }
        }

    }
#ifdef WRITE_CHECK_FILE
    fclose(fp);
#endif

    printf("Hit rejections:\n");
    printf("  %-5d singleton\n", reject_singleton);
    printf("  %-5d min ID and min score\n", reject_both);
    printf("  %-5d only min ID\n", reject_id);
    printf("  %-5d only min score\n", reject_score);
    if(only_single_bes) printf("  %-5d multiple seqCtgs hitting same BES\n", reject_mult_seqctg);
    printf("  %-5d not in best contig\n", false_hit_cnt);
    printf("  %-5d seqCtg hit too many ctgs (%d seqCtgs)\n\n", bad_hits_mult, mult_cnt);
    printf("Total good hits: %d\n\n", good_hits);

    printf("Pair rejections:\n");
    printf("  %-5d same orientation\n", reject_rc);
    if(only_pos_olap) printf("  %-5d negative overlap\n", reject_pos_olap);
    printf("  %-5d too much sequence overlap\n", reject_max_olap);
    printf("  %-5d below minimum fpc overlap\n", reject_min_fpc_olap);
    printf("  %-5d above maximum fpc overlap\n", reject_max_fpc_olap);
    printf("  %-5d different contigs\n\n", reject_ctg);
    printf("Total good pairs: %d\n\n", good_cnt);
    fflush(0);
    return;
}

int
compute_pairs(char *filename)
{
    BSS_INST bsi;
    char bss_path[MAXPATHLEN];
    char* ptr;

    if (load_bss_file(filename, minScore, minId, &bsi)) 
    {
        /* make path for the bss file of good pair hits we will output */
        strcpy(bss_path,bsi.bssfile);
        ptr = strrchr(bss_path,'/');
        if (ptr)
        {
            ptr++;
            *ptr = 0;
            strcat(bss_path,"mtp_pairs.bss");
        }
        else
        {
            strcpy(bss_path,"./mtp_pairs.bss");
        }                
        find_good_pairs(&bsi);
        strcpy(bsi.bssfile,bss_path);
        write_bss_file(&bsi,0);

        destroy_bss_instance(&bsi);

        return 1;
    }

    return 0;
}
